Sunah Park

This markdown file is created by Sunah Park for extended lab exercises in the book An Introduction to Statistical Learning with Applications in R (ISLR).


Setup for code chunks

# NOTE(review): removed `rm(list=ls())` — knitr renders in a fresh session
# anyway, and clearing the global environment destroys the user's workspace
# when this file is sourced interactively.

# Default knitr chunk options for the whole document.
knitr::opts_chunk$set(
    ########## Text results ##########
    echo=TRUE,        # show source code in the rendered output
    warning=FALSE,    # suppress warnings in the rendered output
    error=FALSE,      # stop rendering on errors instead of displaying them
    message=FALSE,    # suppress messages in the rendered output
    strip.white=TRUE, # remove blank lines at the start/end of a source chunk

    ########## Cache ##########
    cache=TRUE,

    ########## Plots ##########
    fig.path="", # prefix to be used for figure filenames
    fig.width=8,
    fig.height=6,
    dpi=200
)

library(ISLR)         # Carseats data set
library(tree)         # classification and regression trees
library(ggplot2)
library(MASS)         # Boston data set
library(randomForest) # bagging and random forests
library(gbm)          # gradient boosted machines
library(caret)        # confusionMatrix()

1. Classification Tree

# Quick look at the Carseats data: 400 stores, 11 variables.
head(Carseats,3); dim(Carseats)
##   Sales CompPrice Income Advertising Population Price ShelveLoc Age
## 1  9.50       138     73          11        276   120       Bad  42
## 2 11.22       111     48          16        260    83      Good  65
## 3 10.06       113     35          10        269    80    Medium  59
##   Education Urban  US
## 1        17   Yes Yes
## 2        10   Yes Yes
## 3        12   Yes Yes
## [1] 400  11
# Binary response: "Yes" when Sales exceeds 8 (thousand units), else "No".
# NOTE(review): recent versions of the tree package require a factor response;
# wrap in as.factor() if tree() errors here — confirm against your tree version.
High<-ifelse(Carseats$Sales<=8, "No","Yes")
Carseats<-data.frame(Carseats,High)
tree.carseats<-tree(High~.-Sales, data=Carseats) # Fit a classification tree in order to predict High using all variables but Sales
summary(tree.carseats)
## 
## Classification tree:
## tree(formula = High ~ . - Sales, data = Carseats)
## Variables actually used in tree construction:
## [1] "ShelveLoc"   "Price"       "Income"      "CompPrice"   "Population" 
## [6] "Advertising" "Age"         "US"         
## Number of terminal nodes:  27 
## Residual mean deviance:  0.4575 = 170.7 / 373 
## Misclassification error rate: 0.09 = 36 / 400
# Draw the tree skeleton; text() overlays the split labels on it.
plot(tree.carseats)
text(tree.carseats, pretty=0, cex=.7) # pretty=0 to include the category names for any qualitative predictors

# Printing the tree object lists every node: split rule, n, deviance,
# predicted class, and class probabilities; "*" marks terminal nodes.
tree.carseats
## node), split, n, deviance, yval, (yprob)
##       * denotes terminal node
## 
##   1) root 400 541.500 No ( 0.59000 0.41000 )  
##     2) ShelveLoc: Bad,Medium 315 390.600 No ( 0.68889 0.31111 )  
##       4) Price < 92.5 46  56.530 Yes ( 0.30435 0.69565 )  
##         8) Income < 57 10  12.220 No ( 0.70000 0.30000 )  
##          16) CompPrice < 110.5 5   0.000 No ( 1.00000 0.00000 ) *
##          17) CompPrice > 110.5 5   6.730 Yes ( 0.40000 0.60000 ) *
##         9) Income > 57 36  35.470 Yes ( 0.19444 0.80556 )  
##          18) Population < 207.5 16  21.170 Yes ( 0.37500 0.62500 ) *
##          19) Population > 207.5 20   7.941 Yes ( 0.05000 0.95000 ) *
##       5) Price > 92.5 269 299.800 No ( 0.75465 0.24535 )  
##        10) Advertising < 13.5 224 213.200 No ( 0.81696 0.18304 )  
##          20) CompPrice < 124.5 96  44.890 No ( 0.93750 0.06250 )  
##            40) Price < 106.5 38  33.150 No ( 0.84211 0.15789 )  
##              80) Population < 177 12  16.300 No ( 0.58333 0.41667 )  
##               160) Income < 60.5 6   0.000 No ( 1.00000 0.00000 ) *
##               161) Income > 60.5 6   5.407 Yes ( 0.16667 0.83333 ) *
##              81) Population > 177 26   8.477 No ( 0.96154 0.03846 ) *
##            41) Price > 106.5 58   0.000 No ( 1.00000 0.00000 ) *
##          21) CompPrice > 124.5 128 150.200 No ( 0.72656 0.27344 )  
##            42) Price < 122.5 51  70.680 Yes ( 0.49020 0.50980 )  
##              84) ShelveLoc: Bad 11   6.702 No ( 0.90909 0.09091 ) *
##              85) ShelveLoc: Medium 40  52.930 Yes ( 0.37500 0.62500 )  
##               170) Price < 109.5 16   7.481 Yes ( 0.06250 0.93750 ) *
##               171) Price > 109.5 24  32.600 No ( 0.58333 0.41667 )  
##                 342) Age < 49.5 13  16.050 Yes ( 0.30769 0.69231 ) *
##                 343) Age > 49.5 11   6.702 No ( 0.90909 0.09091 ) *
##            43) Price > 122.5 77  55.540 No ( 0.88312 0.11688 )  
##              86) CompPrice < 147.5 58  17.400 No ( 0.96552 0.03448 ) *
##              87) CompPrice > 147.5 19  25.010 No ( 0.63158 0.36842 )  
##               174) Price < 147 12  16.300 Yes ( 0.41667 0.58333 )  
##                 348) CompPrice < 152.5 7   5.742 Yes ( 0.14286 0.85714 ) *
##                 349) CompPrice > 152.5 5   5.004 No ( 0.80000 0.20000 ) *
##               175) Price > 147 7   0.000 No ( 1.00000 0.00000 ) *
##        11) Advertising > 13.5 45  61.830 Yes ( 0.44444 0.55556 )  
##          22) Age < 54.5 25  25.020 Yes ( 0.20000 0.80000 )  
##            44) CompPrice < 130.5 14  18.250 Yes ( 0.35714 0.64286 )  
##              88) Income < 100 9  12.370 No ( 0.55556 0.44444 ) *
##              89) Income > 100 5   0.000 Yes ( 0.00000 1.00000 ) *
##            45) CompPrice > 130.5 11   0.000 Yes ( 0.00000 1.00000 ) *
##          23) Age > 54.5 20  22.490 No ( 0.75000 0.25000 )  
##            46) CompPrice < 122.5 10   0.000 No ( 1.00000 0.00000 ) *
##            47) CompPrice > 122.5 10  13.860 No ( 0.50000 0.50000 )  
##              94) Price < 125 5   0.000 Yes ( 0.00000 1.00000 ) *
##              95) Price > 125 5   0.000 No ( 1.00000 0.00000 ) *
##     3) ShelveLoc: Good 85  90.330 Yes ( 0.22353 0.77647 )  
##       6) Price < 135 68  49.260 Yes ( 0.11765 0.88235 )  
##        12) US: No 17  22.070 Yes ( 0.35294 0.64706 )  
##          24) Price < 109 8   0.000 Yes ( 0.00000 1.00000 ) *
##          25) Price > 109 9  11.460 No ( 0.66667 0.33333 ) *
##        13) US: Yes 51  16.880 Yes ( 0.03922 0.96078 ) *
##       7) Price > 135 17  22.070 No ( 0.64706 0.35294 )  
##        14) Income < 46 6   0.000 No ( 1.00000 0.00000 ) *
##        15) Income > 46 11  15.160 Yes ( 0.45455 0.54545 ) *

We see that the training error rate is 9%.

The most important indicator of Sales appears to be shelving location (ShelveLoc), since the first branch differentiates Good locations from Bad and Medium locations. By typing tree.carseats, R displays the split criterion, the number of observations in that branch, the deviance, the overall prediction for the branch, and the fraction of observations in that branch that take on values of Yes or No. Branches that lead to terminal nodes are indicated using asterisks.

set.seed(2)
# Validation-set split: 200 of the 400 stores for training, rest for testing.
# seq_len(nrow(...)) is the safe idiom for 1:nrow(...); identical draws here.
train<-sample(seq_len(nrow(Carseats)),200)
High.test<-High[-train]

# Refit on the training half, then predict class labels on the held-out half.
tree.carseats<-tree(High~.-Sales, data=Carseats[train,])
tree.pred<-predict(tree.carseats, Carseats[-train,], type="class") # The argument type="class" instructs R to return the actual class prediction.
table(tree.pred, High.test)
##          High.test
## tree.pred No Yes
##       No  86  27
##       Yes 30  57
# Accuracy computed directly from the predictions instead of hard-coding
# counts transcribed from the printed table (fragile and error-prone).
accuracy<-mean(tree.pred==High.test); accuracy
## [1] 0.715
confusionMatrix(tree.pred, as.factor(High.test)) # from the caret library
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  86  27
##        Yes 30  57
##                                           
##                Accuracy : 0.715           
##                  95% CI : (0.6471, 0.7764)
##     No Information Rate : 0.58            
##     P-Value [Acc > NIR] : 5.355e-05       
##                                           
##                   Kappa : 0.4179          
##  Mcnemar's Test P-Value : 0.7911          
##                                           
##             Sensitivity : 0.7414          
##             Specificity : 0.6786          
##          Pos Pred Value : 0.7611          
##          Neg Pred Value : 0.6552          
##              Prevalence : 0.5800          
##          Detection Rate : 0.4300          
##    Detection Prevalence : 0.5650          
##       Balanced Accuracy : 0.7100          
##                                           
##        'Positive' Class : No              
## 

This approach leads to correct predictions for around 71.5% of the locations in the test data set.

We now consider whether pruning the tree might lead to improved results. The function cv.tree() performs cross-validation in order to determine the optimal level of tree complexity; cost complexity pruning is used in order to select a sequence of trees for consideration. We use the argument FUN=prune.misclass in order to indicate that we want the classification error rate to guide the cross-validation and pruning process, rather than the default for the cv.tree() function, which is deviance.

The cv.tree() function reports the number of terminal nodes of each tree considered(size) as well as the corresponding error rate and the value of the cost-complexity parameter used (k).

set.seed(3)
cv.carseats<-cv.tree(tree.carseats, FUN=prune.misclass, K=10) # FUN: the function used for pruning (prune.misclass: classification error rate), K: the number of CV folds.
names(cv.carseats)
## [1] "size"   "dev"    "k"      "method"
cv.carseats
## $size
## [1] 19 17 14 13  9  7  3  2  1
## 
## $dev
## [1] 55 55 53 52 50 56 69 65 80
## 
## $k
## [1]       -Inf  0.0000000  0.6666667  1.0000000  1.7500000  2.0000000
## [7]  4.2500000  5.0000000 23.0000000
## 
## $method
## [1] "misclass"
## 
## attr(,"class")
## [1] "prune"         "tree.sequence"
# CV error (dev) against tree size and against the cost-complexity parameter k.
par(mfrow=c(1,2))
plot(cv.carseats$size, cv.carseats$dev, type="b")
plot(cv.carseats$k, cv.carseats$dev, type="b")

dev corresponds to the cross-validation error rate. The tree with 9 terminal nodes results in the lowest cross-validation error rate with 50 cross-validation errors.

prune.carseats<-prune.misclass(tree.carseats, best=cv.carseats$size[which.min(cv.carseats$dev)]) # prune to the CV-selected size (nine terminal nodes here); prune.misclass() = prune.tree(method="misclass")
plot(prune.carseats)
text(prune.carseats, pretty=0)

# Test-set performance of the pruned tree.
tree.pred<-predict(prune.carseats, Carseats[-train,], type="class")
table(tree.pred, High.test)
##          High.test
## tree.pred No Yes
##       No  94  24
##       Yes 22  60
# Accuracy from the predictions directly (equals (94+60)/200 from the table).
accuracy<-mean(tree.pred==High.test); accuracy
## [1] 0.77
confusionMatrix(tree.pred, as.factor(High.test))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  94  24
##        Yes 22  60
##                                           
##                Accuracy : 0.77            
##                  95% CI : (0.7054, 0.8264)
##     No Information Rate : 0.58            
##     P-Value [Acc > NIR] : 1.341e-08       
##                                           
##                   Kappa : 0.5264          
##  Mcnemar's Test P-Value : 0.8828          
##                                           
##             Sensitivity : 0.8103          
##             Specificity : 0.7143          
##          Pos Pred Value : 0.7966          
##          Neg Pred Value : 0.7317          
##              Prevalence : 0.5800          
##          Detection Rate : 0.4700          
##    Detection Prevalence : 0.5900          
##       Balanced Accuracy : 0.7623          
##                                           
##        'Positive' Class : No              
## 

Now 77% of the test observations are correctly classified, so not only has the pruning process produced a more interpretable tree, but it has also improved the classification accuracy.

prune.carseats<-prune.misclass(tree.carseats, best=15) # larger than the CV-selected size, for comparison
plot(prune.carseats)
text(prune.carseats, pretty=0)

# Test-set performance of the 15-node tree.
tree.pred<-predict(prune.carseats, Carseats[-train,],type="class")
table(tree.pred, High.test)
##          High.test
## tree.pred No Yes
##       No  86  22
##       Yes 30  62
# Accuracy from the predictions directly (equals (86+62)/200 from the table).
accuracy<-mean(tree.pred==High.test); accuracy
## [1] 0.74
confusionMatrix(tree.pred, as.factor(High.test))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  86  22
##        Yes 30  62
##                                           
##                Accuracy : 0.74            
##                  95% CI : (0.6734, 0.7993)
##     No Information Rate : 0.58            
##     P-Value [Acc > NIR] : 1.802e-06       
##                                           
##                   Kappa : 0.4733          
##  Mcnemar's Test P-Value : 0.3317          
##                                           
##             Sensitivity : 0.7414          
##             Specificity : 0.7381          
##          Pos Pred Value : 0.7963          
##          Neg Pred Value : 0.6739          
##              Prevalence : 0.5800          
##          Detection Rate : 0.4300          
##    Detection Prevalence : 0.5400          
##       Balanced Accuracy : 0.7397          
##                                           
##        'Positive' Class : No              
## 

We obtain a larger pruned tree with lower classification accuracy (74%).


2. Regression Tree

# Regression tree for median house value (medv) on a 50% training split.
set.seed(1)
# seq_len(nrow(...)) is the safe idiom for 1:nrow(...); identical draws here.
train<-sample(seq_len(nrow(Boston)), nrow(Boston)/2)
tree.boston<-tree(medv~., Boston[train,])
summary(tree.boston)
## 
## Regression tree:
## tree(formula = medv ~ ., data = Boston[train, ])
## Variables actually used in tree construction:
## [1] "lstat" "rm"    "dis"  
## Number of terminal nodes:  8 
## Residual mean deviance:  12.65 = 3099 / 245 
## Distribution of residuals:
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -14.10000  -2.04200  -0.05357   0.00000   1.96000  12.60000
# Draw the fitted regression tree with split labels.
plot(tree.boston)
text(tree.boston, pretty=0, cex=.7)

We now use the cv.tree() function to see whether pruning the tree will improve performance.

# Cross-validate the pruning sequence (deviance-guided for regression trees).
cv.boston<-cv.tree(tree.boston, FUN=prune.tree, K=10)
par(mfrow=c(1,2))
plot(cv.boston$size, cv.boston$dev, type="b")
# In this case, the more complex tree is selected by cross-validation. 
plot(cv.boston$k, cv.boston$dev, type="b")

par(mfrow=c(1,1))
prune.boston<-prune.tree(tree.boston, best=5) # We still want to prune the tree
plot(prune.boston)
text(prune.boston, pretty=0, cex=.7)

## MSE of Pruned tree
tree.pred<-predict(prune.boston, newdata=Boston[-train,])
boston.test<-Boston[-train, "medv"] # held-out response values

# Predicted vs actual; the 45-degree line marks perfect prediction.
plot(tree.pred, boston.test)
abline(0,1)

mean((tree.pred-boston.test)^2) # test MSE
## [1] 26.83413
## MSE of Unpruned tree
tree.pred<-predict(tree.boston, newdata=Boston[-train,])
boston.test<-Boston[-train, "medv"]

# Predicted vs actual for the full (unpruned) tree.
plot(tree.pred, boston.test)
abline(0,1)

mean((tree.pred-boston.test)^2) # test MSE
## [1] 25.04559

The test set MSE associated with the regression tree is 25.05. The square root of the MSE is therefore around 5.005, indicating that this model leads to test predictions that are within around $5,005 of the true median home value for the suburb. Note that the MSE of the unpruned tree is smaller than that of the pruned tree, which can be explained from the cross-validation error plot.


3. Bagging and Random Forests

Bagging is simply a special case of a random forest with m=p. Therefore, the randomForest() function can be used to perform both random forests and bagging.

# NOTE(review): identical seed and call as in Section 2, so `train` is the
# same split; repeated so this section can be run stand-alone.
set.seed(1)
train<-sample(seq_len(nrow(Boston)), nrow(Boston)/2)

Bagging

# mtry=13 uses all predictors at every split (Boston has 13 predictors
# besides medv), which makes this bagging rather than a random forest.
set.seed(1)
bag.boston<-randomForest(medv~., data=Boston[train,], 
                         mtry=13, # Number of variables randomly sampled as candidates at each split
                         ntree=25) # Number of trees to grow
bag.boston
## 
## Call:
##  randomForest(formula = medv ~ ., data = Boston[train, ], mtry = 13,      ntree = 25) 
##                Type of random forest: regression
##                      Number of trees: 25
## No. of variables tried at each split: 13
## 
##           Mean of squared residuals: 12.03488
##                     % Var explained: 85.43
# Test-set MSE of the bagged model.
bag.pred<-predict(bag.boston, newdata=Boston[-train,])
boston.test<-Boston[-train, "medv"]
plot(bag.pred, boston.test)

mean((bag.pred-boston.test)^2)
## [1] 14.40655

The argument mtry=13 indicates that all 13 predictors should be considered for each split of the tree - in other words, that bagging should be done.

The test set MSE associated with the bagged regression tree is almost half that obtained using a pruned single tree.

Random Forest

Growing a random forest proceeds in exactly the same way, except that we use a smaller value of the mtry argument. By default, randomForest() uses p/3 variables when building a random forest of regression trees, and sqrt(p) variables when building a random forest of classification trees. Here we use mtry=6.

set.seed(1)
rf.boston<-randomForest(medv~., data=Boston[train,], 
                         mtry=6, # Number of variables randomly sampled as candidates at each split
                         ntree=25, # Number of trees to grow
                         importance=TRUE) # also compute variable-importance measures (%IncMSE, IncNodePurity)
rf.boston
## 
## Call:
##  randomForest(formula = medv ~ ., data = Boston[train, ], mtry = 6,      ntree = 25, importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 25
## No. of variables tried at each split: 6
## 
##           Mean of squared residuals: 15.96178
##                     % Var explained: 80.67
# Test-set MSE of the random forest.
rf.pred<-predict(rf.boston, newdata=Boston[-train,])
boston.test<-Boston[-train, "medv"]
plot(rf.pred, boston.test)

mean((rf.pred-boston.test)^2)
## [1] 11.78977

The random forests yield an improvement over bagging in terms of MSE.

Using the importance function, we can view the importance of each variable. Two measures of variable importance are reported. The former is based upon the mean decrease of accuracy in predictions on the out of bag samples when a given variable is excluded from the model. The latter is a measure of the total increase in node impurity that results from splits over that variable, averaged over all trees. In the case of regression trees, the node impurity is measured by the training RSS, and for classification trees by the deviance. Plots of these importance measures can be produced using the varImpPlot() function.

# %IncMSE: mean decrease in out-of-bag accuracy when the variable is permuted;
# IncNodePurity: total decrease in RSS from splits on the variable.
importance(rf.boston)
##           %IncMSE IncNodePurity
## crim    3.4901635     704.78119
## zn      0.7377281      37.26717
## indus   2.3719798    1675.17900
## chas    0.9616988     130.11590
## nox     2.7436447    1336.04737
## rm      8.1295861    6942.74427
## age     2.2047824     736.01296
## dis     4.0057542     723.39989
## rad     1.1491627      83.85964
## tax     1.3745050     275.63541
## ptratio 2.3532122     805.43706
## black   0.9664194     253.93570
## lstat   8.2098280    6939.08139
varImpPlot(rf.boston)

The results indicate that across all of the trees considered in the random forest, the lstat (the wealth level of the community) and the house size (rm) are by far the two most important variables.


4. Boosting

We run gbm() with the option distribution="gaussian" since this is a regression problem; if it were a binary classification problem, we would use distribution="bernoulli".

set.seed(1)
boost.boston<-gbm(medv~., data=Boston[train,], 
                  distribution="gaussian", # squared-error loss for regression
                  n.trees=5000, # number of trees (default:100)
                  interaction.depth=4) # the depth of each tree

summary(boost.boston) # relative influence of each variable

##             var    rel.inf
## lstat     lstat 37.0661275
## rm           rm 25.3533123
## dis         dis 11.7903016
## crim       crim  8.0388750
## black     black  4.2531659
## nox         nox  3.5058570
## age         age  3.4868724
## ptratio ptratio  2.2500385
## indus     indus  1.7725070
## tax         tax  1.1836592
## chas       chas  0.7441319
## rad         rad  0.4274311
## zn           zn  0.1277206
# Test-set MSE of the boosted model (all 5000 trees used for prediction).
boost.pred<-predict(boost.boston, newdata=Boston[-train,], 
                    n.trees=5000)
mean((boost.pred-boston.test)^2)
## [1] 10.81479
## shrinkage parameter (learning rate)
boost.boston<-gbm(medv~., data=Boston[train,], 
                  distribution="gaussian",
                  n.trees=5000, # number of trees (default:100)
                  interaction.depth=4, # the maximum depth of each tree (the highest level of variable interactions allowed). A value of 1 implies an additive model, a value of 2 implies a model with up to 2-way interactions. Default is 1.
                  shrinkage=0.2, # shrinkage parameter applied to each tree in the expansion. Also known as the learning rate or step-size reduction. A smaller learning rate typically requires more trees. Default is 0.1
                  verbose=FALSE) # FALSE/TRUE spelled out: T/F are reassignable and unsafe

boost.pred<-predict(boost.boston, newdata=Boston[-train,], 
                    n.trees=5000)
mean((boost.pred-boston.test)^2)
## [1] 11.51109

We see that lstat and rm are by far the most important variables. We can also produce partial dependence plots for these two variables. These plots illustrate the marginal effect of the selected variables on the response after integrating out the other variables. The median house prices are increasing with rm and decreasing with lstat.

# Partial dependence plots: marginal effect of rm and lstat on predicted medv.
par(mfrow=c(1,2))
plot(boost.boston, i="rm")

plot(boost.boston, i="lstat")